#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib.ticker as mtick
# Set the style and color palette for all plots
sns.set_palette("tab20")
sns.set_style("whitegrid")
# Set the legend position to center left for all plots
plt.rcParams['legend.loc'] = 'center left'
# read data from the .data file
data = pd.read_csv('/Users/ushabhanu/Desktop/TakeHomeProject/census-bureau.data', header=None)
# read column names from the .columns file
with open('/Users/ushabhanu/Desktop/TakeHomeProject/census-bureau.columns', 'r') as f:
columns = f.read().splitlines()
# set the column names
data.columns = columns
# Displaying first five elements of all columns
with pd.option_context('display.max_columns', None):
display(data.head())
| age | class of worker | detailed industry recode | detailed occupation recode | education | wage per hour | enroll in edu inst last wk | marital stat | major industry code | major occupation code | race | hispanic origin | sex | member of a labor union | reason for unemployment | full or part time employment stat | capital gains | capital losses | dividends from stocks | tax filer stat | region of previous residence | state of previous residence | detailed household and family stat | detailed household summary in household | weight | migration code-change in msa | migration code-change in reg | migration code-move within reg | live in this house 1 year ago | migration prev res in sunbelt | num persons worked for employer | family members under 18 | country of birth father | country of birth mother | country of birth self | citizenship | own business or self employed | fill inc questionnaire for veteran's admin | veterans benefits | weeks worked in year | year | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 73 | Not in universe | 0 | 0 | High school graduate | 0 | Not in universe | Widowed | Not in universe or children | Not in universe | White | All other | Female | Not in universe | Not in universe | Not in labor force | 0 | 0 | 0 | Nonfiler | Not in universe | Not in universe | Other Rel 18+ ever marr not in subfamily | Other relative of householder | 1700.09 | ? | ? | ? | Not in universe under 1 year old | ? | 0 | Not in universe | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 2 | 0 | 95 | - 50000. |
| 1 | 58 | Self-employed-not incorporated | 4 | 34 | Some college but no degree | 0 | Not in universe | Divorced | Construction | Precision production craft & repair | White | All other | Male | Not in universe | Not in universe | Children or Armed Forces | 0 | 0 | 0 | Head of household | South | Arkansas | Householder | Householder | 1053.55 | MSA to MSA | Same county | Same county | No | Yes | 1 | Not in universe | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 2 | 52 | 94 | - 50000. |
| 2 | 18 | Not in universe | 0 | 0 | 10th grade | 0 | High school | Never married | Not in universe or children | Not in universe | Asian or Pacific Islander | All other | Female | Not in universe | Not in universe | Not in labor force | 0 | 0 | 0 | Nonfiler | Not in universe | Not in universe | Child 18+ never marr Not in a subfamily | Child 18 or older | 991.95 | ? | ? | ? | Not in universe under 1 year old | ? | 0 | Not in universe | Vietnam | Vietnam | Vietnam | Foreign born- Not a citizen of U S | 0 | Not in universe | 2 | 0 | 95 | - 50000. |
| 3 | 9 | Not in universe | 0 | 0 | Children | 0 | Not in universe | Never married | Not in universe or children | Not in universe | White | All other | Female | Not in universe | Not in universe | Children or Armed Forces | 0 | 0 | 0 | Nonfiler | Not in universe | Not in universe | Child <18 never marr not in subfamily | Child under 18 never married | 1758.14 | Nonmover | Nonmover | Nonmover | Yes | Not in universe | 0 | Both parents present | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 0 | 0 | 94 | - 50000. |
| 4 | 10 | Not in universe | 0 | 0 | Children | 0 | Not in universe | Never married | Not in universe or children | Not in universe | White | All other | Female | Not in universe | Not in universe | Children or Armed Forces | 0 | 0 | 0 | Nonfiler | Not in universe | Not in universe | Child <18 never marr not in subfamily | Child under 18 never married | 1069.16 | Nonmover | Nonmover | Nonmover | Yes | Not in universe | 0 | Both parents present | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 0 | 0 | 94 | - 50000. |
# Shape of the data
data.shape
(199523, 42)
# Quick overview of the distribution and range of values in each column
data.describe()
| age | detailed industry recode | detailed occupation recode | wage per hour | capital gains | capital losses | dividends from stocks | weight | num persons worked for employer | own business or self employed | veterans benefits | weeks worked in year | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 199523.000000 | 199523.000000 | 199523.000000 | 199523.000000 | 199523.00000 | 199523.000000 | 199523.000000 | 199523.000000 | 199523.000000 | 199523.000000 | 199523.000000 | 199523.000000 | 199523.000000 |
| mean | 34.494199 | 15.352320 | 11.306556 | 55.426908 | 434.71899 | 37.313788 | 197.529533 | 1740.380269 | 1.956180 | 0.175438 | 1.514833 | 23.174897 | 94.499672 |
| std | 22.310895 | 18.067129 | 14.454204 | 274.896454 | 4697.53128 | 271.896428 | 1984.163658 | 993.768156 | 2.365126 | 0.553694 | 0.851473 | 24.411488 | 0.500001 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 37.870000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 94.000000 |
| 25% | 15.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 1061.615000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 94.000000 |
| 50% | 33.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 1618.310000 | 1.000000 | 0.000000 | 2.000000 | 8.000000 | 94.000000 |
| 75% | 50.000000 | 33.000000 | 26.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 2188.610000 | 4.000000 | 0.000000 | 2.000000 | 52.000000 | 95.000000 |
| max | 90.000000 | 51.000000 | 46.000000 | 9999.000000 | 99999.00000 | 4608.000000 | 99999.000000 | 18656.300000 | 6.000000 | 2.000000 | 2.000000 | 52.000000 | 95.000000 |
# Displaying concise summary of the DataFrame
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 199523 entries, 0 to 199522 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 199523 non-null int64 1 class of worker 199523 non-null object 2 detailed industry recode 199523 non-null int64 3 detailed occupation recode 199523 non-null int64 4 education 199523 non-null object 5 wage per hour 199523 non-null int64 6 enroll in edu inst last wk 199523 non-null object 7 marital stat 199523 non-null object 8 major industry code 199523 non-null object 9 major occupation code 199523 non-null object 10 race 199523 non-null object 11 hispanic origin 198649 non-null object 12 sex 199523 non-null object 13 member of a labor union 199523 non-null object 14 reason for unemployment 199523 non-null object 15 full or part time employment stat 199523 non-null object 16 capital gains 199523 non-null int64 17 capital losses 199523 non-null int64 18 dividends from stocks 199523 non-null int64 19 tax filer stat 199523 non-null object 20 region of previous residence 199523 non-null object 21 state of previous residence 199523 non-null object 22 detailed household and family stat 199523 non-null object 23 detailed household summary in household 199523 non-null object 24 weight 199523 non-null float64 25 migration code-change in msa 199523 non-null object 26 migration code-change in reg 199523 non-null object 27 migration code-move within reg 199523 non-null object 28 live in this house 1 year ago 199523 non-null object 29 migration prev res in sunbelt 199523 non-null object 30 num persons worked for employer 199523 non-null int64 31 family members under 18 199523 non-null object 32 country of birth father 199523 non-null object 33 country of birth mother 199523 non-null object 34 country of birth self 199523 non-null object 35 citizenship 199523 non-null object 36 own business or self employed 199523 non-null int64 37 fill inc questionnaire for veteran's admin 199523 non-null object 38 veterans benefits 199523 non-null int64 39 weeks worked in year 199523 non-null int64 40 year 199523 non-null int64 41 label 199523 non-null object dtypes: float64(1), int64(12), object(29) memory usage: 63.9+ MB
As a data scientist, you are tasked by your retail business client with identifying two groups of people for marketing purposes: People who earn an income of less than $50,000 and those who earn more than $50,000. To assist in this pursuit, Walmart has developed a means of accessing 40 different demographic and employment related variables for any person they are interested in marketing to. Additionally, Walmart has been able to compile a dataset that provides gold labels for a variety of observations of these 40 variables within the population. Using the dataset given, train and validate a classifier that predicts this outcome.
Our goal is to develop a predictive model using the provided dataset to assess whether the income level of people in the United States is greater/lesser than $50,000.
It is clear from the problem statement that it is a classification problem, Let's have alook at the target variable
# Looking at the distribution of the target variable
data['label'].value_counts()
- 50000. 187141 50000+. 12382 Name: label, dtype: int64
# Displaying the proportion of label less than $50,000
targetcount = data['label'].value_counts()
income_below_50k_count = float(targetcount['- 50000.'])
percent_below_50k = (income_below_50k_count / len(data['label'])) * 100
print(percent_below_50k, "% People earn an income of less than $50,000")
93.79419916500854 % People earn an income of less than $50,000
# Plot to show the distribution of income label
sns.countplot(x='label', data=data)
# add labels and title
plt.xlabel('Income')
plt.ylabel('Count')
plt.title('Distribution of Label')
Text(0.5, 1.0, 'Distribution of Label')
Data is heavily skewed towards having income less than $50,000
Bias in the model: The model may be better at predicting the income of people who earn less than $50,000 but not as good at predicting the income of people who earn more than $50,000
If a model predicts the majority class for every instance, it may achieve a high accuracy, but it is not useful for the minority class
# Check for duplicate rows
data.duplicated().sum()
3229
# identify duplicate rows
duplicates = data.duplicated()
# print duplicate rows
data[duplicates].head()
| age | class of worker | detailed industry recode | detailed occupation recode | education | wage per hour | enroll in edu inst last wk | marital stat | major industry code | major occupation code | ... | country of birth father | country of birth mother | country of birth self | citizenship | own business or self employed | fill inc questionnaire for veteran's admin | veterans benefits | weeks worked in year | year | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2784 | 11 | Not in universe | 0 | 0 | Children | 0 | Not in universe | Never married | Not in universe or children | Not in universe | ... | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 0 | 0 | 94 | - 50000. |
| 4967 | 4 | Not in universe | 0 | 0 | Children | 0 | Not in universe | Never married | Not in universe or children | Not in universe | ... | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 0 | 0 | 95 | - 50000. |
| 5175 | 9 | Not in universe | 0 | 0 | Children | 0 | Not in universe | Never married | Not in universe or children | Not in universe | ... | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 0 | 0 | 94 | - 50000. |
| 5728 | 12 | Not in universe | 0 | 0 | Children | 0 | Not in universe | Never married | Not in universe or children | Not in universe | ... | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 0 | 0 | 95 | - 50000. |
| 5935 | 0 | Not in universe | 0 | 0 | Children | 0 | Not in universe | Never married | Not in universe or children | Not in universe | ... | United-States | United-States | United-States | Native- Born in the United States | 0 | Not in universe | 0 | 0 | 94 | - 50000. |
5 rows × 42 columns
# Droppping the duplicated rows
data = data.drop_duplicates()
data.duplicated().sum()
0
import missingno as msno
# create matrix plot of missing values
msno.matrix(data)
<AxesSubplot:>
# Check for missing values
data.isnull().sum()
age 0 class of worker 0 detailed industry recode 0 detailed occupation recode 0 education 0 wage per hour 0 enroll in edu inst last wk 0 marital stat 0 major industry code 0 major occupation code 0 race 0 hispanic origin 870 sex 0 member of a labor union 0 reason for unemployment 0 full or part time employment stat 0 capital gains 0 capital losses 0 dividends from stocks 0 tax filer stat 0 region of previous residence 0 state of previous residence 0 detailed household and family stat 0 detailed household summary in household 0 weight 0 migration code-change in msa 0 migration code-change in reg 0 migration code-move within reg 0 live in this house 1 year ago 0 migration prev res in sunbelt 0 num persons worked for employer 0 family members under 18 0 country of birth father 0 country of birth mother 0 country of birth self 0 citizenship 0 own business or self employed 0 fill inc questionnaire for veteran's admin 0 veterans benefits 0 weeks worked in year 0 year 0 label 0 dtype: int64
# Counts of each unique value
data['hispanic origin'].value_counts()
All other 168803 Mexican-American 8008 Mexican (Mexicano) 7210 Central or South American 3891 Puerto Rican 3306 Other Spanish 2476 Cuban 1122 Do not know 305 Chicano 303 Name: hispanic origin, dtype: int64
# Create a function to determine the origin based on country of birth of self,mother and father
def determine_origin(row):
us_origin = ['United-States']
central_american_origin = ['Belize', 'Costa Rica', 'El Salvador', 'Guatemala', 'Honduras', 'Mexico', 'Nicaragua', 'Panama']
south_american_origin = ['Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Guyana', 'Paraguay', 'Peru', 'Suriname', 'Uruguay', 'Venezuela']
if row['country of birth self'] in us_origin and row['country of birth father'] in us_origin and row['country of birth mother'] in us_origin:
return 'Not Hispanic'
elif row['country of birth self'] in central_american_origin or row['country of birth father'] in central_american_origin or row['country of birth mother'] in central_american_origin:
return 'Central_American'
elif row['country of birth self'] in south_american_origin or row['country of birth father'] in south_american_origin or row['country of birth mother'] in south_american_origin:
return 'South_American'
else:
return 'other'
# Apply the function to create the "origin" column
data['hispanic origin'] = data.apply(determine_origin, axis=1)
# Modified unique values of 'hispanic origin'
data['hispanic origin'].value_counts()
Not Hispanic 150297 other 32915 Central_American 12274 South_American 808 Name: hispanic origin, dtype: int64
The hispanic origin is modified such that
If country of birth self,country of birth father and country of birth mother are "United-States" then the hispanic origin is "Not Hispanic"
If country of birth self or country of birth father or country of birth mother is in Central American countries their hispanic origin is "Central_American"
If country of birth self or country of birth father or country of birth mother is in South American countries their hispanic origin is "South_American"
Else the hispanic origin is "other"
# Check for missing values
data.isnull().sum()
age 0 class of worker 0 detailed industry recode 0 detailed occupation recode 0 education 0 wage per hour 0 enroll in edu inst last wk 0 marital stat 0 major industry code 0 major occupation code 0 race 0 hispanic origin 0 sex 0 member of a labor union 0 reason for unemployment 0 full or part time employment stat 0 capital gains 0 capital losses 0 dividends from stocks 0 tax filer stat 0 region of previous residence 0 state of previous residence 0 detailed household and family stat 0 detailed household summary in household 0 weight 0 migration code-change in msa 0 migration code-change in reg 0 migration code-move within reg 0 live in this house 1 year ago 0 migration prev res in sunbelt 0 num persons worked for employer 0 family members under 18 0 country of birth father 0 country of birth mother 0 country of birth self 0 citizenship 0 own business or self employed 0 fill inc questionnaire for veteran's admin 0 veterans benefits 0 weeks worked in year 0 year 0 label 0 dtype: int64
# Group the data by hispanic origin and income label
grouped = data.groupby(['hispanic origin', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='hispanic origin', y='count', hue='label', kind='bar')
plt.title('Relationship between hispanic origin and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0a0912ac0>
# Garbage values
# get the count of garbage values in each column
garbage_count = data.apply(lambda x: x[x == '?'].count())
# print the count of garbage values in each column
garbage_count.sort_values(ascending=False)
migration code-change in msa 98015 migration code-change in reg 98015 migration code-move within reg 98015 migration prev res in sunbelt 98015 country of birth father 6703 country of birth mother 6107 country of birth self 3389 state of previous residence 707 citizenship 0 own business or self employed 0 class of worker 0 family members under 18 0 num persons worked for employer 0 fill inc questionnaire for veteran's admin 0 live in this house 1 year ago 0 veterans benefits 0 weeks worked in year 0 year 0 weight 0 detailed household summary in household 0 detailed household and family stat 0 age 0 region of previous residence 0 tax filer stat 0 detailed industry recode 0 detailed occupation recode 0 education 0 wage per hour 0 enroll in edu inst last wk 0 marital stat 0 major industry code 0 major occupation code 0 race 0 hispanic origin 0 sex 0 member of a labor union 0 reason for unemployment 0 full or part time employment stat 0 capital gains 0 capital losses 0 dividends from stocks 0 label 0 dtype: int64
# Counts of each unique value
data['migration code-change in msa'].value_counts()
? 98015 Nonmover 81128 MSA to MSA 10572 NonMSA to nonMSA 2802 Not in universe 1419 MSA to nonMSA 787 NonMSA to MSA 615 Abroad to MSA 453 Not identifiable 430 Abroad to nonMSA 73 Name: migration code-change in msa, dtype: int64
# Counts of each unique value
data['migration code-change in reg'].value_counts()
? 98015 Nonmover 81128 Same county 9779 Different county same state 2792 Not in universe 1419 Different region 1178 Different state same division 990 Abroad 530 Different division same region 463 Name: migration code-change in reg, dtype: int64
# Counts of each unique value
data['migration code-move within reg'].value_counts()
? 98015 Nonmover 81128 Same county 9779 Different county same state 2792 Not in universe 1419 Different state in South 972 Different state in West 678 Different state in Midwest 551 Abroad 530 Different state in Northeast 430 Name: migration code-move within reg, dtype: int64
# Counts of each unique value
data['migration prev res in sunbelt'].value_counts()
? 98015 Not in universe 82547 No 9959 Yes 5773 Name: migration prev res in sunbelt, dtype: int64
data = data.drop(['migration code-change in msa', 'migration code-change in reg',
'migration code-move within reg', 'migration prev res in sunbelt'], axis=1)
# Counts of each unique value
data['country of birth father'].value_counts()
United-States 156037 Mexico 9948 ? 6703 Puerto-Rico 2676 Italy 2212 Canada 1380 Germany 1356 Dominican-Republic 1284 Poland 1210 Philippines 1152 Cuba 1121 El-Salvador 982 China 854 England 791 Columbia 614 India 579 South Korea 530 Ireland 508 Jamaica 461 Vietnam 457 Guatemala 443 Japan 391 Portugal 387 Ecuador 378 Haiti 351 Greece 344 Peru 335 Nicaragua 314 Hungary 306 Scotland 247 Iran 233 Yugoslavia 217 Taiwan 199 Cambodia 195 Honduras 194 France 191 Outlying-U S (Guam USVI etc) 159 Laos 154 Trinadad&Tobago 112 Thailand 107 Hong Kong 106 Holand-Netherlands 51 Panama 25 Name: country of birth father, dtype: int64
# Counts of each unique value
data['country of birth mother'].value_counts()
United-States 157355 Mexico 9721 ? 6107 Puerto-Rico 2468 Italy 1844 Canada 1451 Germany 1382 Philippines 1228 Poland 1109 El-Salvador 1107 Cuba 1104 Dominican-Republic 1097 England 901 China 759 Columbia 612 South Korea 607 Ireland 599 India 580 Vietnam 473 Japan 468 Jamaica 452 Guatemala 442 Ecuador 375 Peru 355 Haiti 353 Portugal 341 Nicaragua 301 Hungary 297 Greece 261 Scotland 241 Taiwan 222 Honduras 218 France 212 Iran 198 Yugoslavia 177 Outlying-U S (Guam USVI etc) 157 Cambodia 156 Laos 155 Thailand 123 Hong Kong 107 Trinadad&Tobago 98 Holand-Netherlands 49 Panama 32 Name: country of birth mother, dtype: int64
# Counts of each unique value
data['country of birth self'].value_counts()
United-States 173783 Mexico 5759 ? 3389 Puerto-Rico 1400 Germany 850 Philippines 844 Cuba 836 Canada 700 El-Salvador 689 Dominican-Republic 687 China 478 South Korea 471 England 455 Columbia 434 Italy 419 India 407 Vietnam 391 Poland 381 Guatemala 343 Japan 339 Jamaica 320 Peru 268 Ecuador 258 Haiti 228 Nicaragua 218 Taiwan 201 Portugal 174 Iran 157 Greece 147 Honduras 144 Ireland 135 France 121 Outlying-U S (Guam USVI etc) 119 Thailand 113 Laos 105 Hong Kong 100 Cambodia 94 Hungary 79 Scotland 75 Trinadad&Tobago 66 Yugoslavia 66 Panama 28 Holand-Netherlands 23 Name: country of birth self, dtype: int64
# Counts of each unique value
data['state of previous residence'].value_counts()
Not in universe 180562 California 1710 Utah 1061 Florida 847 North Carolina 810 ? 707 Abroad 671 Oklahoma 622 Minnesota 572 Indiana 528 North Dakota 497 New Mexico 462 Michigan 441 Alaska 290 Kentucky 243 Arizona 243 New Hampshire 242 Wyoming 241 Colorado 239 Oregon 236 West Virginia 231 Georgia 227 Montana 226 Alabama 216 Ohio 211 Texas 208 Mississippi 204 Arkansas 203 Tennessee 200 Pennsylvania 199 New York 195 Louisiana 192 Vermont 191 Iowa 189 Illinois 180 Nebraska 177 Missouri 174 Nevada 174 Maine 166 Massachusetts 151 Kansas 149 South Dakota 138 Maryland 135 Virginia 126 Connecticut 117 District of Columbia 113 Wisconsin 105 South Carolina 95 New Jersey 75 Delaware 73 Idaho 30 Name: state of previous residence, dtype: int64
# Relacing the '?' grabage values as 'Other/Not known'
data['country of birth father'] = np.where(data['country of birth father'] == '?', 'Other/Not known', data['country of birth father'])
data['country of birth mother'] = np.where(data['country of birth mother'] == '?', 'Other/Not known', data['country of birth mother'])
data['country of birth self'] = np.where(data['country of birth self'] == '?', 'Other/Not known', data['country of birth self'])
data['state of previous residence'] = np.where(data['state of previous residence'] == '?', 'Other/Not known', data['state of previous residence'])
# Counts of each unique value
data['country of birth father'].value_counts()
United-States 156037 Mexico 9948 Other/Not known 6703 Puerto-Rico 2676 Italy 2212 Canada 1380 Germany 1356 Dominican-Republic 1284 Poland 1210 Philippines 1152 Cuba 1121 El-Salvador 982 China 854 England 791 Columbia 614 India 579 South Korea 530 Ireland 508 Jamaica 461 Vietnam 457 Guatemala 443 Japan 391 Portugal 387 Ecuador 378 Haiti 351 Greece 344 Peru 335 Nicaragua 314 Hungary 306 Scotland 247 Iran 233 Yugoslavia 217 Taiwan 199 Cambodia 195 Honduras 194 France 191 Outlying-U S (Guam USVI etc) 159 Laos 154 Trinadad&Tobago 112 Thailand 107 Hong Kong 106 Holand-Netherlands 51 Panama 25 Name: country of birth father, dtype: int64
# Counts of each unique value
data['country of birth mother'].value_counts()
United-States 157355 Mexico 9721 Other/Not known 6107 Puerto-Rico 2468 Italy 1844 Canada 1451 Germany 1382 Philippines 1228 Poland 1109 El-Salvador 1107 Cuba 1104 Dominican-Republic 1097 England 901 China 759 Columbia 612 South Korea 607 Ireland 599 India 580 Vietnam 473 Japan 468 Jamaica 452 Guatemala 442 Ecuador 375 Peru 355 Haiti 353 Portugal 341 Nicaragua 301 Hungary 297 Greece 261 Scotland 241 Taiwan 222 Honduras 218 France 212 Iran 198 Yugoslavia 177 Outlying-U S (Guam USVI etc) 157 Cambodia 156 Laos 155 Thailand 123 Hong Kong 107 Trinadad&Tobago 98 Holand-Netherlands 49 Panama 32 Name: country of birth mother, dtype: int64
# Counts of each unique value
data['country of birth self'].value_counts()
United-States 173783 Mexico 5759 Other/Not known 3389 Puerto-Rico 1400 Germany 850 Philippines 844 Cuba 836 Canada 700 El-Salvador 689 Dominican-Republic 687 China 478 South Korea 471 England 455 Columbia 434 Italy 419 India 407 Vietnam 391 Poland 381 Guatemala 343 Japan 339 Jamaica 320 Peru 268 Ecuador 258 Haiti 228 Nicaragua 218 Taiwan 201 Portugal 174 Iran 157 Greece 147 Honduras 144 Ireland 135 France 121 Outlying-U S (Guam USVI etc) 119 Thailand 113 Laos 105 Hong Kong 100 Cambodia 94 Hungary 79 Scotland 75 Trinadad&Tobago 66 Yugoslavia 66 Panama 28 Holand-Netherlands 23 Name: country of birth self, dtype: int64
# Counts of each unique value
data['state of previous residence'].value_counts()
Not in universe 180562 California 1710 Utah 1061 Florida 847 North Carolina 810 Other/Not known 707 Abroad 671 Oklahoma 622 Minnesota 572 Indiana 528 North Dakota 497 New Mexico 462 Michigan 441 Alaska 290 Kentucky 243 Arizona 243 New Hampshire 242 Wyoming 241 Colorado 239 Oregon 236 West Virginia 231 Georgia 227 Montana 226 Alabama 216 Ohio 211 Texas 208 Mississippi 204 Arkansas 203 Tennessee 200 Pennsylvania 199 New York 195 Louisiana 192 Vermont 191 Iowa 189 Illinois 180 Nebraska 177 Missouri 174 Nevada 174 Maine 166 Massachusetts 151 Kansas 149 South Dakota 138 Maryland 135 Virginia 126 Connecticut 117 District of Columbia 113 Wisconsin 105 South Carolina 95 New Jersey 75 Delaware 73 Idaho 30 Name: state of previous residence, dtype: int64
# Checking for garbage values in modified data
# get the count of garbage values in each column
garbage_count = data.apply(lambda x: x[x == '?'].count())
# print the count of garbage values in each column
garbage_count.sort_values(ascending=False)
age 0 country of birth father 0 state of previous residence 0 detailed household and family stat 0 detailed household summary in household 0 weight 0 live in this house 1 year ago 0 num persons worked for employer 0 family members under 18 0 country of birth mother 0 class of worker 0 country of birth self 0 citizenship 0 own business or self employed 0 fill inc questionnaire for veteran's admin 0 veterans benefits 0 weeks worked in year 0 year 0 region of previous residence 0 tax filer stat 0 dividends from stocks 0 capital losses 0 detailed industry recode 0 detailed occupation recode 0 education 0 wage per hour 0 enroll in edu inst last wk 0 marital stat 0 major industry code 0 major occupation code 0 race 0 hispanic origin 0 sex 0 member of a labor union 0 reason for unemployment 0 full or part time employment stat 0 capital gains 0 label 0 dtype: int64
# Calculate the percentage of "Not in universe" in each column
percentages = (data == 'Not in universe').mean() * 100
# Print the percentages for each column
for col, percent in zip(data.columns, percentages):
print(f"{col}: {percent:.2f}%")
age: 0.00% class of worker: 49.43% detailed industry recode: 0.00% detailed occupation recode: 0.00% education: 0.00% wage per hour: 0.00% enroll in edu inst last wk: 93.62% marital stat: 0.00% major industry code: 0.00% major occupation code: 49.65% race: 0.00% hispanic origin: 0.00% sex: 0.00% member of a labor union: 90.29% reason for unemployment: 96.91% full or part time employment stat: 0.00% capital gains: 0.00% capital losses: 0.00% dividends from stocks: 0.00% tax filer stat: 0.00% region of previous residence: 91.99% state of previous residence: 91.99% detailed household and family stat: 0.00% detailed household summary in household: 0.00% weight: 0.00% live in this house 1 year ago: 0.00% num persons worked for employer: 0.00% family members under 18: 73.44% country of birth father: 0.00% country of birth mother: 0.00% country of birth self: 0.00% citizenship: 0.00% own business or self employed: 0.00% fill inc questionnaire for veteran's admin: 98.99% veterans benefits: 0.00% weeks worked in year: 0.00% year: 0.00% label: 0.00%
# group age by reason for unemployment
age_income = data.groupby(['label', pd.cut(data['age'], range(0, 101, 10))])['age'].count().unstack(level=0)
# create bar plot
age_income.plot(kind='bar', stacked=True)
plt.xlabel('Age Group')
plt.ylabel('Count')
plt.title('Income by Age Group')
Text(0.5, 1.0, 'Income by Age Group')
Most individuals who have above 50,000 income are older than 20 years
Individuals between 30 to 60 years, paticularly are mostly having income more than 50,000
The income information for individuals after 40 years mark are gradually reducing
# Counts of each unique value
data['class of worker'].value_counts()
Not in universe 97029 Private 72021 Self-employed-not incorporated 8442 Local government 7783 State government 4227 Self-employed-incorporated 3264 Federal government 2925 Never worked 438 Without pay 165 Name: class of worker, dtype: int64
# group age by class of worker
age_classofworker = data.groupby(['class of worker', pd.cut(data['age'], range(0, 101, 10))])['age'].count().unstack(level=0)
# create bar plot
age_classofworker.plot(kind='bar', stacked=True)
plt.xlabel('Age Group')
plt.ylabel('Count')
plt.title('Class of worker by Age Group')
Text(0.5, 1.0, 'Class of worker by Age Group')
# Replacing the class of worker value as Children/Student for age less than 18
data.loc[data['age'] < 18, 'class of worker'] = 'Children/Student'
If the people have
then it is likely that they might not be working at the time
# Replace 'Not in universe' with 'Not working'
conditions = ((data['full or part time employment stat'].isin(['Not in labor force', 'Unemployed full-time', 'Unemployed part-time'])) & (data['age'] > 18) & (data['wage per hour'] == 0))
data.loc[conditions, 'class of worker'] = data.loc[conditions, 'class of worker'].replace('Not in universe', 'Not working')
# Counts of each unique value
data['class of worker'].value_counts()
Private 69839 Children/Student 52826 Not in universe 24320 Not working 22621 Self-employed-not incorporated 8304 Local government 7700 State government 4205 Self-employed-incorporated 3226 Federal government 2915 Never worked 193 Without pay 145 Name: class of worker, dtype: int64
# Group the data by class of worker and income label
grouped = data.groupby(['class of worker', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='class of worker', y='count', hue='label', kind='bar')
plt.title('Relationship between class of worker and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0a0b49be0>
# Range of the Industry recode
print(data['detailed industry recode'].min())
print(data['detailed industry recode'].max())
0 51
# Create violin plot
sns.violinplot(x='label', y='detailed industry recode', data=data)
plt.title('Distribution of Industry code with Income label')
Text(0.5, 1.0, 'Distribution of Industry code with Income label')
# Range of the occupation recode
print(data['detailed occupation recode'].min())
print(data['detailed occupation recode'].max())
0 46
# Create violin plot
sns.violinplot(x='label', y='detailed occupation recode', data=data)
plt.title('Distribution of occupation code with Income label')
Text(0.5, 1.0, 'Distribution of occupation code with Income label')
# Counts of each unique value
data['education'].value_counts()
High school graduate 48374 Children 44347 Some college but no degree 27809 Bachelors degree(BA AB BS) 19859 7th and 8th grade 7976 10th grade 7539 11th grade 6862 Masters degree(MA MS MEng MEd MSW MBA) 6540 9th grade 6193 Associates degree-occup /vocational 5358 Associates degree-academic program 4363 5th or 6th grade 3277 12th grade no diploma 2125 1st 2nd 3rd or 4th grade 1799 Prof school degree (MD DDS DVM LLB JD) 1793 Doctorate degree(PhD EdD) 1263 Less than 1st grade 817 Name: education, dtype: int64
data['education'].replace({'Less than 1st grade':'Elementary/Middle School',
'1st 2nd 3rd or 4th grade': 'Elementary/Middle School',
'5th or 6th grade': 'Elementary/Middle School',
'7th and 8th grade': 'Elementary/Middle School',
'9th grade': 'High School',
'10th grade': 'High School',
'11th grade': 'High School',
'12th grade no diploma': 'High School',
'High school graduate': 'High School',
'Some college but no degree': 'Dropout'}, inplace=True)
# Counts of each unique value
data['education'].value_counts()
High School 71093 Children 44347 Dropout 27809 Bachelors degree(BA AB BS) 19859 Elementary/Middle School 13869 Masters degree(MA MS MEng MEd MSW MBA) 6540 Associates degree-occup /vocational 5358 Associates degree-academic program 4363 Prof school degree (MD DDS DVM LLB JD) 1793 Doctorate degree(PhD EdD) 1263 Name: education, dtype: int64
# Group the data by education and income label
grouped = data.groupby(['education', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='education', y='count', hue='label', kind='bar')
plt.title('Relationship between education and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0b2858d00>
# Range of the Industry recode
print(data['wage per hour'].min())
print(data['wage per hour'].max())
0 9999
# Create violin plot
sns.violinplot(x='label', y='wage per hour', data=data)
plt.xticks(rotation=90)
plt.title('Distribution of wage per hour with Income label')
Text(0.5, 1.0, 'Distribution of wage per hour with Income label')
We can infer that the majority of the values in the 'wage per hour' column are 0, since both the median (50th percentile) and the third quartile (75th percentile) are 0
Fascinatingly, the majority of individuals with incomes of 50,000 or more have wages per hour equal to zero
This could indicate that they have an alternative means of income to employment or that the data is flawed
# Counts of each unique value
data['enroll in edu inst last wk'].value_counts()
Not in universe 183762 High school 6853 College or university 5679 Name: enroll in edu inst last wk, dtype: int64
# Group the data by enroll in edu inst last wk and income label
grouped = data.groupby(['enroll in edu inst last wk', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='enroll in edu inst last wk', y='count', hue='label', kind='bar')
plt.title('Relationship between enroll in edu inst last wk and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0c228d190>
# Counts of each unique value
data['marital stat'].value_counts()
Married-civilian spouse present 84194 Never married 83296 Divorced 12707 Widowed 10456 Separated 3459 Married-spouse absent 1517 Married-A F spouse present 665 Name: marital stat, dtype: int64
# Group the data by marital stat and income label
grouped = data.groupby(['marital stat', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='marital stat', y='count', hue='label', kind='bar')
plt.title('Relationship between marital stat and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0c1b73c10>
# Counts of each unique value
data['major industry code'].value_counts()
Not in universe or children 97467 Retail trade 17069 Manufacturing-durable goods 9014 Education 8283 Manufacturing-nondurable goods 6895 Finance insurance and real estate 6145 Construction 5984 Business and repair services 5651 Medical except hospital 4683 Public administration 4610 Other professional services 4482 Transportation 4209 Hospital services 3964 Wholesale trade 3594 Agriculture 3021 Personal services except private HH 2937 Social services 2547 Entertainment 1650 Communications 1181 Utilities and sanitary services 1178 Private household services 945 Mining 563 Forestry and fisheries 186 Armed Forces 36 Name: major industry code, dtype: int64
# Separating 'Not in universe' and Children
data.loc[data['age'] < 18, 'major industry code'] = 'Children'
data['major industry code'].replace('Not in universe or children', 'Not in universe', inplace=True)
# Counts of each unique value
data['major industry code'].value_counts()
Children 52826 Not in universe 47134 Retail trade 15686 Manufacturing-durable goods 8987 Education 8196 Manufacturing-nondurable goods 6824 Finance insurance and real estate 6119 Construction 5934 Business and repair services 5549 Medical except hospital 4627 Public administration 4590 Other professional services 4436 Transportation 4197 Hospital services 3960 Wholesale trade 3562 Personal services except private HH 2883 Agriculture 2862 Social services 2461 Entertainment 1513 Communications 1175 Utilities and sanitary services 1173 Private household services 820 Mining 561 Forestry and fisheries 184 Armed Forces 35 Name: major industry code, dtype: int64
# Group the data by major industry code and income label
grouped = data.groupby(['major industry code', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='major industry code', y='count', hue='label', kind='bar')
plt.title('Relationship between major industry code and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0a0d70a90>
For individuals earning 50,000 or less, the majority are either 'children' or 'not in universe', indicating that a significant proportion of this population might not be a part of the workforce or not old enough to work
For individuals earning more than 50,000, 'manufacturing-durable goods' is the most common industry
# Counts of each unique value
data['major occupation code'].value_counts()
Not in universe 97467 Adm support including clerical 14836 Professional specialty 13940 Executive admin and managerial 12495 Other service 12097 Sales 11781 Precision production craft & repair 10517 Machine operators assmblrs & inspctrs 6377 Handlers equip cleaners etc 4126 Transportation and material moving 4020 Farming forestry and fishing 3143 Technicians and related support 3018 Protective services 1661 Private household services 780 Armed Forces 36 Name: major occupation code, dtype: int64
# Separating 'Not in universe' and Children
data.loc[data['age'] < 18, 'major occupation code'] = 'Children'
data['major occupation code'].replace('Not in universe or children', 'Not in universe', inplace=True)
# Counts of each unique value
data['major occupation code'].value_counts()
Children 52826 Not in universe 47134 Adm support including clerical 14657 Professional specialty 13868 Executive admin and managerial 12485 Sales 11188 Other service 11178 Precision production craft & repair 10473 Machine operators assmblrs & inspctrs 6346 Transportation and material moving 3995 Handlers equip cleaners etc 3814 Technicians and related support 3012 Farming forestry and fishing 2964 Protective services 1646 Private household services 673 Armed Forces 35 Name: major occupation code, dtype: int64
# Group the data by major occupation code and income label
grouped = data.groupby(['major occupation code', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='major occupation code', y='count', hue='label', kind='bar')
plt.title('Relationship between major occupation code and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0c1bdfbb0>
# Counts of each unique value
data['race'].value_counts()
White 164380 Black 20206 Asian or Pacific Islander 5821 Other 3645 Amer Indian Aleut or Eskimo 2242 Name: race, dtype: int64
# Group the data by race and income label
grouped = data.groupby(['race', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='race', y='count', hue='label', kind='bar')
plt.title('Relationship between race and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0a0d70d60>
# Counts of each unique value
data['sex'].value_counts()
Female 102400 Male 93894 Name: sex, dtype: int64
# Group the data by sex and income label
grouped = data.groupby(['sex', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='sex', y='count', hue='label', kind='bar')
plt.title('Relationship between sex and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0c224dac0>
# Counts of each unique value
data['member of a labor union'].value_counts()
Not in universe 177232 No 16032 Yes 3030 Name: member of a labor union, dtype: int64
# Counts of each unique value
data['reason for unemployment'].value_counts()
Not in universe 190226 Other job loser 2038 Re-entrant 2018 Job loser - on layoff 976 Job leaver 598 New entrant 438 Name: reason for unemployment, dtype: int64
# Counts of each unique value
data['full or part time employment stat'].value_counts()
Children or Armed Forces 120632 Full-time schedules 40728 Not in labor force 26726 PT for non-econ reasons usually FT 3322 Unemployed full-time 2310 PT for econ reasons usually PT 1209 Unemployed part- time 842 PT for econ reasons usually FT 525 Name: full or part time employment stat, dtype: int64
# Replace values in 'full or part time employment stat'
data.loc[(data['full or part time employment stat'] == 'Children or Armed Forces') & (data['age'] < 18), 'full or part time employment stat'] = 'Children'
data['full or part time employment stat'] = data['full or part time employment stat'].replace('Children or Armed Forces', 'Armed Forces')
replace_map = {'PT for econ reasons usually FT': 'Part Time',
'PT for econ reasons usually PT': 'Part Time',
'PT for non-econ reasons usually FT': 'Part Time',
'Unemployed full-time': 'Unemployed',
'Unemployed part- time': 'Unemployed',
'Not in labor force': 'Unemployed'}
data['full or part time employment stat'] = data['full or part time employment stat'].replace(replace_map)
# Counts of each unique value
data['full or part time employment stat'].value_counts()
Armed Forces 72083 Children 48549 Full-time schedules 40728 Unemployed 29878 Part Time 5056 Name: full or part time employment stat, dtype: int64
# Group the data by full or part time employment stat and income label
grouped = data.groupby(['full or part time employment stat', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='full or part time employment stat', y='count', hue='label', kind='bar')
plt.title('Relationship between full or part time employment stat and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0a12d2610>
# Statistics of 'capital gains', 'capital losses', 'dividends from stocks'
data[['capital gains', 'capital losses', 'dividends from stocks']].describe()
| capital gains | capital losses | dividends from stocks | |
|---|---|---|---|
| count | 196294.000000 | 196294.000000 | 196294.000000 |
| mean | 441.870037 | 37.927593 | 200.722386 |
| std | 4735.677027 | 274.081174 | 2000.130616 |
| min | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 |
| 75% | 0.000000 | 0.000000 | 0.000000 |
| max | 99999.000000 | 4608.000000 | 99999.000000 |
# Counts of each unique value
data['tax filer stat'].value_counts()
Nonfiler 71903 Joint both under 65 67367 Single 37409 Joint both 65+ 8326 Head of household 7425 Joint one under 65 & one 65+ 3864 Name: tax filer stat, dtype: int64
# Group the data by tax filer stat and income label
grouped = data.groupby(['tax filer stat', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='tax filer stat', y='count', hue='label', kind='bar')
plt.title('Relationship between tax filer stat and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0d1bcddc0>
# Counts of each unique value
data['region of previous residence'].value_counts()
Not in universe 180562 South 4875 West 4068 Midwest 3559 Northeast 2700 Abroad 530 Name: region of previous residence, dtype: int64
# Counts of each unique value
data['state of previous residence'].value_counts()
Not in universe 180562 California 1710 Utah 1061 Florida 847 North Carolina 810 Other/Not known 707 Abroad 671 Oklahoma 622 Minnesota 572 Indiana 528 North Dakota 497 New Mexico 462 Michigan 441 Alaska 290 Kentucky 243 Arizona 243 New Hampshire 242 Wyoming 241 Colorado 239 Oregon 236 West Virginia 231 Georgia 227 Montana 226 Alabama 216 Ohio 211 Texas 208 Mississippi 204 Arkansas 203 Tennessee 200 Pennsylvania 199 New York 195 Louisiana 192 Vermont 191 Iowa 189 Illinois 180 Nebraska 177 Missouri 174 Nevada 174 Maine 166 Massachusetts 151 Kansas 149 South Dakota 138 Maryland 135 Virginia 126 Connecticut 117 District of Columbia 113 Wisconsin 105 South Carolina 95 New Jersey 75 Delaware 73 Idaho 30 Name: state of previous residence, dtype: int64
# Counts of each unique value
data['detailed household and family stat'].value_counts()
Householder 53242 Child <18 never marr not in subfamily 47219 Spouse of householder 41670 Nonfamily householder 22205 Child 18+ never marr Not in a subfamily 12016 Secondary individual 6109 Other Rel 18+ ever marr not in subfamily 1953 Grandchild <18 never marr child of subfamily RP 1839 Other Rel 18+ never marr not in subfamily 1726 Grandchild <18 never marr not in subfamily 1055 Child 18+ ever marr Not in a subfamily 1013 Child under 18 of RP of unrel subfamily 729 RP of unrelated subfamily 685 Child 18+ ever marr RP of subfamily 671 Other Rel 18+ ever marr RP of subfamily 656 Other Rel <18 never marr child of subfamily RP 653 Other Rel 18+ spouse of subfamily RP 638 Child 18+ never marr RP of subfamily 589 Other Rel <18 never marr not in subfamily 580 Grandchild 18+ never marr not in subfamily 375 In group quarters 195 Child 18+ spouse of subfamily RP 126 Other Rel 18+ never marr RP of subfamily 94 Child <18 never marr RP of subfamily 80 Spouse of RP of unrelated subfamily 52 Child <18 ever marr not in subfamily 36 Grandchild 18+ ever marr not in subfamily 34 Grandchild 18+ spouse of subfamily RP 10 Child <18 ever marr RP of subfamily 9 Grandchild 18+ ever marr RP of subfamily 9 Grandchild 18+ never marr RP of subfamily 6 Other Rel <18 ever marr RP of subfamily 6 Other Rel <18 never married RP of subfamily 4 Other Rel <18 spouse of subfamily RP 3 Child <18 spouse of subfamily RP 2 Grandchild <18 never marr RP of subfamily 2 Grandchild <18 ever marr not in subfamily 2 Other Rel <18 ever marr not in subfamily 1 Name: detailed household and family stat, dtype: int64
# Counts of each unique value
data['detailed household summary in household'].value_counts()
Householder 75461 Child under 18 never married 47318 Spouse of householder 41684 Child 18 or older 14416 Other relative of householder 9651 Nonrelative of householder 7585 Group Quarters- Secondary individual 132 Child under 18 ever married 47 Name: detailed household summary in household, dtype: int64
# Group the data by detailed household summary in household and income label
grouped = data.groupby(['detailed household summary in household', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='detailed household summary in household', y='count', hue='label', kind='bar')
plt.title('Relationship between detailed household summary in household and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0a1068f40>
# Range of the Industry recode
print(data['weight'].min())
print(data['weight'].max())
data['weight'].describe()
37.87 18656.3
count 196294.000000 mean 1743.267584 std 996.945985 min 37.870000 25% 1061.530000 50% 1620.175000 75% 2194.060000 max 18656.300000 Name: weight, dtype: float64
# Counts of each unique value
data['live in this house 1 year ago'].value_counts()
Not in universe under 1 year old 99434 Yes 81128 No 15732 Name: live in this house 1 year ago, dtype: int64
# Counts of each unique value
data['num persons worked for employer'].value_counts()
0 92770 6 36507 1 23103 4 14377 3 13424 2 10079 5 6034 Name: num persons worked for employer, dtype: int64
# Group the data by num persons worked for employer and income label
grouped = data.groupby(['num persons worked for employer', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='num persons worked for employer', y='count', hue='label', kind='bar')
plt.title('Relationship between num persons worked for employer and Income Labels')
Text(0.5, 1.0, 'Relationship between num persons worked for employer and Income Labels')
# Counts of each unique value
data['family members under 18'].value_counts()
Not in universe 144161 Both parents present 36107 Mother only present 12517 Father only present 1871 Neither parent present 1638 Name: family members under 18, dtype: int64
# Counts of each unique value
data['citizenship'].value_counts()
Native- Born in the United States 173786 Foreign born- Not a citizen of U S 13385 Foreign born- U S citizen by naturalization 5851 Native- Born abroad of American Parent(s) 1753 Native- Born in Puerto Rico or U S Outlying 1519 Name: citizenship, dtype: int64
# Group the data by citizenship and income label
grouped = data.groupby(['citizenship', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='citizenship', y='count', hue='label', kind='bar')
plt.title('Relationship between citizenship and Income Labels')
# Rotate x-axis labels
plot.set_xticklabels(rotation=90)
<seaborn.axisgrid.FacetGrid at 0x7ff0b3123580>
# Counts of each unique value
data['own business or self employed'].value_counts()
0 177445 2 16151 1 2698 Name: own business or self employed, dtype: int64
# Group the data by own business or self employed and income label
grouped = data.groupby(['own business or self employed', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='own business or self employed', y='count', hue='label', kind='bar')
plt.title('Relationship between own business or self employed and Income Labels')
Text(0.5, 1.0, 'Relationship between own business or self employed and Income Labels')
# Counts of each unique value
data["fill inc questionnaire for veteran's admin"].value_counts()
Not in universe 194310 No 1593 Yes 391 Name: fill inc questionnaire for veteran's admin, dtype: int64
# Counts of each unique value
data["veterans benefits"].value_counts()
2 149976 0 44334 1 1984 Name: veterans benefits, dtype: int64
# Group the data by veterans benefits and income label
grouped = data.groupby(['veterans benefits', 'label']).size().reset_index(name='count')
# Plot the grouped bar plot
plot = sns.catplot(data=grouped, x='veterans benefits', y='count', hue='label', kind='bar')
plt.title('Relationship between veterans benefits and Income Labels')
Text(0.5, 1.0, 'Relationship between veterans benefits and Income Labels')
# Range of the Industry recode
print(data['weeks worked in year'].min())
print(data['weeks worked in year'].max())
data['weeks worked in year'].describe()
0 52
count 196294.000000 mean 23.553889 std 24.428588 min 0.000000 25% 0.000000 50% 12.000000 75% 52.000000 max 52.000000 Name: weeks worked in year, dtype: float64
# Create violin plot
sns.violinplot(x='label', y='weeks worked in year', data=data)
plt.title('Distribution of weeks worked in year with Income label')
Text(0.5, 1.0, 'Distribution of weeks worked in year with Income label')
Most people either work for a single week or for an entire year. (i.e. 52 weeks)
Throughout the year, fewer people work part-time
It is evident that the majority of individuals with annual incomes of at least 50,000 work for an entire year
# Counts of each unique value
data["year"].value_counts()
94 98279 95 98015 Name: year, dtype: int64
# Dropping the columns with large amount of 'Not in universe' values
data = data.drop(['enroll in edu inst last wk', 'major occupation code', 'member of a labor union', 'reason for unemployment', 'region of previous residence', 'state of previous residence', 'family members under 18', 'fill inc questionnaire for veteran\'s admin'], axis=1)
# Calculate the percentage of "Not in universe" in each column
percentages = (data == 'Not in universe').mean() * 100
# Print the percentages for each column
for col, percent in zip(data.columns, percentages):
print(f"{col}: {percent:.2f}%")
age: 0.00% class of worker: 12.39% detailed industry recode: 0.00% detailed occupation recode: 0.00% education: 0.00% wage per hour: 0.00% marital stat: 0.00% major industry code: 24.01% race: 0.00% hispanic origin: 0.00% sex: 0.00% full or part time employment stat: 0.00% capital gains: 0.00% capital losses: 0.00% dividends from stocks: 0.00% tax filer stat: 0.00% detailed household and family stat: 0.00% detailed household summary in household: 0.00% weight: 0.00% live in this house 1 year ago: 0.00% num persons worked for employer: 0.00% country of birth father: 0.00% country of birth mother: 0.00% country of birth self: 0.00% citizenship: 0.00% own business or self employed: 0.00% veterans benefits: 0.00% weeks worked in year: 0.00% year: 0.00% label: 0.00%
data = data.drop(['capital gains', 'capital losses', 'dividends from stocks'], axis=1)
# Printing all categorical columns
cat_cols = data.select_dtypes(include=['object']).columns
print(cat_cols)
Index(['class of worker', 'education', 'marital stat', 'major industry code',
'race', 'hispanic origin', 'sex', 'full or part time employment stat',
'tax filer stat', 'detailed household and family stat',
'detailed household summary in household',
'live in this house 1 year ago', 'country of birth father',
'country of birth mother', 'country of birth self', 'citizenship',
'label'],
dtype='object')
# Performing label encoding
from sklearn.preprocessing import LabelEncoder
categorical_cols = ['class of worker', 'education', 'marital stat', 'major industry code',
'race', 'hispanic origin', 'sex', 'full or part time employment stat',
'tax filer stat', 'detailed household and family stat',
'detailed household summary in household',
'live in this house 1 year ago', 'country of birth father',
'country of birth mother', 'country of birth self', 'citizenship',
'label']
label_encoders = {}
for col in categorical_cols:
le = LabelEncoder()
data[col] = le.fit_transform(data[col])
label_encoders[col] = le
print(label_encoders)
{'class of worker': LabelEncoder(), 'education': LabelEncoder(), 'marital stat': LabelEncoder(), 'major industry code': LabelEncoder(), 'race': LabelEncoder(), 'hispanic origin': LabelEncoder(), 'sex': LabelEncoder(), 'full or part time employment stat': LabelEncoder(), 'tax filer stat': LabelEncoder(), 'detailed household and family stat': LabelEncoder(), 'detailed household summary in household': LabelEncoder(), 'live in this house 1 year ago': LabelEncoder(), 'country of birth father': LabelEncoder(), 'country of birth mother': LabelEncoder(), 'country of birth self': LabelEncoder(), 'citizenship': LabelEncoder(), 'label': LabelEncoder()}
# Displaying first five elements of all columns
with pd.option_context('display.max_columns', None):
display(data.head())
| age | class of worker | detailed industry recode | detailed occupation recode | education | wage per hour | marital stat | major industry code | race | hispanic origin | sex | full or part time employment stat | tax filer stat | detailed household and family stat | detailed household summary in household | weight | live in this house 1 year ago | num persons worked for employer | country of birth father | country of birth mother | country of birth self | citizenship | own business or self employed | veterans benefits | weeks worked in year | year | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 73 | 5 | 0 | 0 | 7 | 0 | 6 | 15 | 4 | 1 | 0 | 4 | 4 | 24 | 6 | 1700.09 | 1 | 0 | 40 | 40 | 40 | 4 | 0 | 2 | 0 | 95 | 0 |
| 1 | 58 | 8 | 4 | 34 | 5 | 0 | 0 | 5 | 4 | 1 | 1 | 0 | 0 | 20 | 4 | 1053.55 | 0 | 1 | 40 | 40 | 40 | 4 | 0 | 2 | 52 | 94 | 0 |
| 2 | 18 | 4 | 0 | 0 | 7 | 0 | 4 | 15 | 1 | 3 | 0 | 4 | 4 | 2 | 0 | 991.95 | 1 | 0 | 41 | 41 | 41 | 0 | 0 | 2 | 0 | 95 | 0 |
| 3 | 9 | 0 | 0 | 0 | 3 | 0 | 4 | 3 | 4 | 1 | 0 | 1 | 4 | 8 | 2 | 1758.14 | 2 | 0 | 40 | 40 | 40 | 4 | 0 | 0 | 0 | 94 | 0 |
| 4 | 10 | 0 | 0 | 0 | 3 | 0 | 4 | 3 | 4 | 1 | 0 | 1 | 4 | 8 | 2 | 1069.16 | 2 | 0 | 40 | 40 | 40 | 4 | 0 | 0 | 0 | 94 | 0 |
data.shape
(196294, 27)
# create a correlation matrix
corr_matrix = data.corr()
# set the figure size
plt.figure(figsize=(25,15))
# plot the heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
<AxesSubplot:>
There is high correlation between
# Dropping features to avoid high correlation
data.drop(['detailed industry recode', 'veterans benefits', 'year', 'num persons worked for employer', 'country of birth father', 'country of birth mother', 'detailed household and family stat'], axis=1, inplace=True)
# Dropping wage per hour because it may potentially reveal an individual's income or financial situation
data.drop(['wage per hour'],axis=1,inplace=True)
# create a correlation matrix
corr_matrix = data.corr()
# set the figure size
plt.figure(figsize=(25,15))
# plot the heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
<AxesSubplot:>
data.shape
(196294, 19)
from sklearn.feature_selection import SelectKBest, f_classif
# Extract the input features (X) and target variable (y) from the data
X = data.drop('label', axis=1)
y = data['label']
# Define the number of features to select (k)
k = 15
# Initialize the SelectKBest object with the f_classif scoring function
selector = SelectKBest(score_func=f_classif, k=k)
# Fit the selector to the data and transform the input features
X_new = selector.fit_transform(X, y)
# Get the indices of the selected features
selected_features_indices = selector.get_support(indices=True)
# Get the names of the selected features
selected_features_names = X.columns[selected_features_indices]
# Print the names of the selected features
print(selected_features_names)
Index(['age', 'class of worker', 'marital stat', 'major industry code', 'race',
'hispanic origin', 'sex', 'full or part time employment stat',
'tax filer stat', 'detailed household summary in household', 'weight',
'live in this house 1 year ago', 'citizenship',
'own business or self employed', 'weeks worked in year'],
dtype='object')
# Dataframe with 15 best features from select k best along with label
df = data[['age', 'class of worker', 'marital stat', 'major industry code', 'race',
'hispanic origin', 'sex', 'full or part time employment stat',
'tax filer stat', 'detailed household summary in household', 'weight',
'live in this house 1 year ago', 'citizenship',
'own business or self employed', 'weeks worked in year','label']]
# Displaying first five elements of all columns
with pd.option_context('display.max_columns', None):
display(df.head())
| age | class of worker | marital stat | major industry code | race | hispanic origin | sex | full or part time employment stat | tax filer stat | detailed household summary in household | weight | live in this house 1 year ago | citizenship | own business or self employed | weeks worked in year | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 73 | 5 | 6 | 15 | 4 | 1 | 0 | 4 | 4 | 6 | 1700.09 | 1 | 4 | 0 | 0 | 0 |
| 1 | 58 | 8 | 0 | 5 | 4 | 1 | 1 | 0 | 0 | 4 | 1053.55 | 0 | 4 | 0 | 52 | 0 |
| 2 | 18 | 4 | 4 | 15 | 1 | 3 | 0 | 4 | 4 | 0 | 991.95 | 1 | 0 | 0 | 0 | 0 |
| 3 | 9 | 0 | 4 | 3 | 4 | 1 | 0 | 1 | 4 | 2 | 1758.14 | 2 | 4 | 0 | 0 | 0 |
| 4 | 10 | 0 | 4 | 3 | 4 | 1 | 0 | 1 | 4 | 2 | 1069.16 | 2 | 4 | 0 | 0 | 0 |
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
# Undersample the data
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
# Plot the distribution before and after undersampling
fig, axs = plt.subplots(ncols=2, figsize=(12,6))
sns.countplot(x='label', data=data, ax=axs[0])
axs[0].set_title('Distribution before undersampling')
sns.countplot(x=y_resampled, ax=axs[1])
axs[1].set_title('Distribution after undersampling')
Text(0.5, 1.0, 'Distribution after undersampling')
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
# KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('KNN Classifier:\n')
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
# Logistic regression classifier
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Logistic Regression Classifier:\n')
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
# Random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Random Forest Classifier:\n')
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
# SVM classifier
svm = svm.SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print('SVM Classifier:\n')
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
KNN Classifier:
Accuracy score: 0.7708459519483142
Confusion matrix:
[[1815 702]
[ 433 2003]]
Classification report:
precision recall f1-score support
0 0.81 0.72 0.76 2517
1 0.74 0.82 0.78 2436
accuracy 0.77 4953
macro avg 0.77 0.77 0.77 4953
weighted avg 0.77 0.77 0.77 4953
Logistic Regression Classifier:
Accuracy score: 0.8041590954976782
Confusion matrix:
[[1910 607]
[ 363 2073]]
Classification report:
precision recall f1-score support
0 0.84 0.76 0.80 2517
1 0.77 0.85 0.81 2436
accuracy 0.80 4953
macro avg 0.81 0.80 0.80 4953
weighted avg 0.81 0.80 0.80 4953
/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Random Forest Classifier:
Accuracy score: 0.8465576418332323
Confusion matrix:
[[2066 451]
[ 309 2127]]
Classification report:
precision recall f1-score support
0 0.87 0.82 0.84 2517
1 0.83 0.87 0.85 2436
accuracy 0.85 4953
macro avg 0.85 0.85 0.85 4953
weighted avg 0.85 0.85 0.85 4953
SVM Classifier:
Accuracy score: 0.8188976377952756
Confusion matrix:
[[1931 586]
[ 311 2125]]
Classification report:
precision recall f1-score support
0 0.86 0.77 0.81 2517
1 0.78 0.87 0.83 2436
accuracy 0.82 4953
macro avg 0.82 0.82 0.82 4953
weighted avg 0.82 0.82 0.82 4953
# Grid search cv on random forest
from sklearn.model_selection import GridSearchCV
# Define the parameter grid to search over
param_grid = {
'n_estimators': [100, 200, 500],
'max_depth': [5, 10, 20, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'max_features': ['auto', 'sqrt']
}
# Create a Random Forest classifier object
rfc = RandomForestClassifier(random_state=42)
# Create a GridSearchCV object to search over the parameter grid
grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=5, n_jobs=-1)
# Fit the GridSearchCV object on the resampled data
grid_search.fit(X_resampled, y_resampled)
# Print the best parameters found by GridSearchCV
print("Best parameters found: ", grid_search.best_params_)
/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_forest.py:427: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers. warn(
Best parameters found: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}
# Random forest classifier
rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_leaf=2, min_samples_split=5, max_features='auto', random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Print results
print('Random Forest Classifier:\n')
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_forest.py:427: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers. warn(
Random Forest Classifier:
Accuracy score: 0.8497880072683223
Confusion matrix:
[[2061 456]
[ 288 2148]]
Classification report:
precision recall f1-score support
0 0.88 0.82 0.85 2517
1 0.82 0.88 0.85 2436
accuracy 0.85 4953
macro avg 0.85 0.85 0.85 4953
weighted avg 0.85 0.85 0.85 4953
# Creating a sample dataset to perfom k means
sample_df = data.sample(n=5000, random_state=42)
sample_df
| age | class of worker | detailed occupation recode | education | marital stat | major industry code | race | hispanic origin | sex | full or part time employment stat | tax filer stat | detailed household summary in household | weight | live in this house 1 year ago | country of birth self | citizenship | own business or self employed | weeks worked in year | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 177099 | 73 | 4 | 0 | 2 | 2 | 15 | 4 | 1 | 1 | 0 | 1 | 4 | 493.29 | 2 | 40 | 4 | 0 | 0 | 0 |
| 110234 | 23 | 6 | 34 | 7 | 2 | 5 | 4 | 1 | 1 | 0 | 2 | 4 | 1584.68 | 2 | 40 | 4 | 0 | 52 | 0 |
| 14798 | 47 | 6 | 2 | 7 | 2 | 13 | 4 | 1 | 0 | 2 | 2 | 7 | 294.51 | 1 | 40 | 4 | 0 | 52 | 0 |
| 137689 | 20 | 6 | 24 | 5 | 2 | 2 | 4 | 0 | 0 | 4 | 2 | 7 | 839.31 | 1 | 40 | 4 | 2 | 30 | 0 |
| 192853 | 54 | 5 | 0 | 7 | 6 | 15 | 4 | 1 | 0 | 4 | 5 | 4 | 2973.42 | 1 | 40 | 4 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 74069 | 55 | 5 | 0 | 4 | 4 | 15 | 4 | 2 | 1 | 4 | 4 | 0 | 663.94 | 1 | 30 | 2 | 0 | 0 | 0 |
| 75434 | 33 | 6 | 36 | 7 | 2 | 17 | 1 | 3 | 0 | 2 | 2 | 7 | 768.77 | 1 | 41 | 1 | 0 | 43 | 0 |
| 16215 | 62 | 5 | 0 | 5 | 0 | 15 | 4 | 1 | 0 | 4 | 4 | 4 | 1382.92 | 1 | 40 | 4 | 0 | 0 | 0 |
| 65519 | 29 | 6 | 2 | 2 | 2 | 4 | 4 | 1 | 1 | 2 | 2 | 4 | 514.35 | 1 | 40 | 4 | 2 | 52 | 1 |
| 78438 | 22 | 5 | 0 | 7 | 4 | 15 | 4 | 1 | 0 | 4 | 4 | 6 | 1676.37 | 1 | 40 | 4 | 0 | 0 | 0 |
5000 rows × 19 columns
sample_df.describe()
| age | class of worker | detailed occupation recode | education | marital stat | major industry code | race | hispanic origin | sex | full or part time employment stat | tax filer stat | detailed household summary in household | weight | live in this house 1 year ago | country of birth self | citizenship | own business or self employed | weeks worked in year | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.00000 | 5000.000000 | 5000.00000 | 5000.000000 | 5000.00000 | 5000.000000 | 5000.00000 | 5000.000000 | 5000.000000 | 5000.000000 |
| mean | 35.319800 | 4.007200 | 11.655600 | 4.960000 | 2.983800 | 10.707800 | 3.619800 | 1.269800 | 0.488400 | 1.34880 | 3.187000 | 4.01520 | 1750.624876 | 1.33480 | 37.971400 | 3.61860 | 0.188200 | 23.780400 | 0.063200 |
| std | 22.108589 | 2.732442 | 14.593683 | 2.187089 | 1.403542 | 6.667272 | 0.895438 | 0.806931 | 0.499915 | 1.40639 | 1.376085 | 2.06529 | 989.691656 | 0.61513 | 6.751866 | 1.11525 | 0.570654 | 24.340079 | 0.243347 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.00000 | 59.510000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 17.000000 | 0.000000 | 0.000000 | 3.000000 | 2.000000 | 3.000000 | 4.000000 | 1.000000 | 0.000000 | 0.00000 | 2.000000 | 2.00000 | 1063.402500 | 1.00000 | 40.000000 | 4.00000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 34.000000 | 5.000000 | 2.000000 | 5.000000 | 2.000000 | 12.000000 | 4.000000 | 1.000000 | 0.000000 | 1.00000 | 4.000000 | 4.00000 | 1630.465000 | 1.00000 | 40.000000 | 4.00000 | 0.000000 | 12.000000 | 0.000000 |
| 75% | 50.000000 | 6.000000 | 26.000000 | 7.000000 | 4.000000 | 15.000000 | 4.000000 | 1.000000 | 1.000000 | 2.00000 | 4.000000 | 6.00000 | 2212.007500 | 2.00000 | 40.000000 | 4.00000 | 0.000000 | 52.000000 | 0.000000 |
| max | 90.000000 | 10.000000 | 46.000000 | 9.000000 | 6.000000 | 24.000000 | 4.000000 | 3.000000 | 1.000000 | 4.00000 | 5.000000 | 7.00000 | 9462.550000 | 2.00000 | 42.000000 | 4.00000 | 2.000000 | 52.000000 | 1.000000 |
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(sample_df)
# Perform the elbow method to determine optimal k
inertia = []
K = range(1, 11)
for k in K:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)
inertia.append(kmeans.inertia_)
# Plot the elbow method
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()
# Perform K-means clustering with optimal k
optimal_k = np.argmin(np.diff(inertia)) + 1
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X)
# Plot each feature distribution in clusters
for i in range(sample_df.shape[1]):
#plt.hist(X[clusters == 0, i], alpha=0.5, label='Cluster 0',color='red')
#plt.hist(X[clusters == 1, i], alpha=0.5, label='Cluster 1',color='yellow')
plt.hist(X[clusters == 2, i], alpha=0.5, label='Cluster 2',color='blue')
plt.xlabel('Standardized Value')
plt.ylabel('Count')
plt.title('Feature ' + sample_df.columns[i] + ' Distribution in Clusters')
plt.legend()
plt.show()
The clustering results showed that the 3 clusters had distinct characteristics. The first cluster consisted of mainly children,never married and non filer of tax having income less than 50,000. The second cluster included aged (18-60years) worker in private sector or attending high school , if married spouse are present and mostly not Hispanic. The third cluster was dominated by armed forces and unemployed individuals, also who were working less number of weeks in a year. Furthermore, it was observed that the clusters were similar in terms of citizenship and race.